One interpretation of spectral clustering is that it seeks a partition of nodes which minimizes the probability that a random walker jumps between clusters. This is similar to RatioCut and similar to minimizing heat diffusion across clusters. In this notebook, we simulate a random walker and count the number of jumps between clusters.
from lib.spectral_clustering import spectral_clustering, laplacian_matrix, similarity_matrix
from lib.datasets import gaussian_mixture
from lib.kmeans import kmeans
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from matplotlib import cm
from tqdm import tqdm
n_gaussians = 5
n_pts = 4
n = n_pts * n_gaussians
d = 2
data = gaussian_mixture(n_gaussians, n_pts, d, centroid_var=10)
cmap = cm.get_cmap("tab20")
unif_colors = [cmap(intensity) for intensity in np.linspace(0, 1, n)]
data_graph = similarity_matrix(data)
transition_matrix = data_graph / np.sum(data_graph, axis=1).reshape((-1, 1))
assns = spectral_clustering(data, k=n_gaussians, lform="rw")
data_clusters = [ data[assns == clss].T for clss in range(n_gaussians) ]
for j, data_cluster in enumerate(data_clusters):
plt.scatter(*data_cluster, color=unif_colors[j], label=f"{j}")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Clustered Sample of Gaussian Mixture")
plt.legend()
plt.savefig("artifacts/random_walk_clusters/Clustered_Sample_Gaussian.png", dpi=120)
plt.show()
import numpy as np
def random_walks(transition_matrix, cluster_assns, iters=1000):
# transition_matrix is nxn graph transition matrix
# cluster_assns (n,) length vector assns
n = len(transition_matrix) # number of nodes
k = np.max(cluster_assns) + 1 # +1 to turn index to length
rw_assns = np.arange(n)
rw_cluster_assns = cluster_assns[rw_assns]
jump_freqs = np.zeros((k, k))
for i in range(iters):
jump_likelihoods = transition_matrix[rw_assns]
new_rw_assns = np.array([ np.random.choice(n, p=jump_likelihoods[i]) for i in range(n) ])
new_cluster_assns = cluster_assns[new_rw_assns]
cluster_jumps, counts = np.unique(np.stack((rw_cluster_assns, new_cluster_assns), axis=0), return_counts=True, axis=1)
for i in range(len(counts)):
r, c = cluster_jumps[:, i]
jump_freqs[r, c] += counts[i]
rw_cluster_assns = new_cluster_assns
rw_assns = new_rw_assns
return jump_freqs
jump_freqs = random_walks(transition_matrix, assns)
from matplotlib.ticker import MaxNLocator
plt.subplot(1, 2, 1)
plt.imshow(jump_freqs)
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Linear Scale")
plt.subplot(1, 2, 2)
plt.imshow(np.log(jump_freqs + 10e-6))
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Log Scale")
plt.suptitle("Transitions Between Clusters")
plt.gcf().set_size_inches(10,5)
plt.savefig("artifacts/random_walk_clusters/Transition Frequencies between Clusters.png", dpi=120)
plt.show()
def random_walks_plot(transition_matrix, data, iters=50):
# transition_matrix is nxn graph transition matrix
# cluster_assns (n,) length vector assns
n = len(transition_matrix) # number of nodes
rw_assns = np.arange(n)
walker = 0
for i in range(iters):
jump_likelihoods = transition_matrix[rw_assns]
plt.scatter(*data[walker], color="k",label="walker")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Clustered Sample of Gaussian Mixture")
plt.savefig("artifacts/random_walk_clusters/walker_gif_pngs/image_step{:004d}.png".format(i))
new_rw_assns = np.array([ np.random.choice(n, p=jump_likelihoods[i]) for i in range(n) ])
walker = new_rw_assns[0]
rw_assns = new_rw_assns
for j, data_cluster in enumerate(data_clusters):
plt.scatter(*data_cluster, color=unif_colors[j], label=f"{j}")
random_walks_plot(transition_matrix, data)